---
title: "Descriptive analysis (subgroup)"
author: "Jae Yeon Kim"
date: "`r Sys.Date()`"
output:
  html_document:
    df_print: paged
---

# Load packages 

```{r}
if (!require(pacman)) install.packages("pacman")

pacman::p_load(
  tidyverse, # tidyverse
  panelr, # panel data analysis
  here, # computational reproducibility
  glue, # gluing objects and strings
  tidylog, # logging analysis
  naniar, # missing data
  readxl,
  ggpubr,
  broom,
  patchwork,
  plm,
  broom.mixed,
  estimatr,
  DeclareDesign,
  sensemakr,
  mice,
  testthat,
  modelsummary,
  flextable
)

source(here("functions/utils.r"))
source(here("functions/theme.R"))

ggplot2::theme_set(theme_bw())

# no scientific notation
options(scipen = 999)
```

# Load files 

```{r message = FALSE}
df <- read_csv(here("processed_data", "panel_data_recoded.csv"))
```

# Subgroup partition 

```{r}
# create a proxy variable
df$proxy <- rep(subset(df, wave == 1)$gendiscrim, 3)

test_that("Proxy setting done correctly", {
  expect_equal(table(df$proxy)[5] %>% as.numeric(), 186)
})

df$prior[df$proxy == 0.5] <- "Middle"
df$prior[df$proxy < 0.5] <- "Low"
df$prior[df$proxy > 0.5] <- "High"

# factorize

df <- df %>% 
  mutate(prior = factor(prior, levels = c("Low", "Middle", "High")))

## covariates
df$wave <- factor(df$wave)
```

# Visualize correlation matrix

## Biden total 

```{r}
density_plot <- df %>%
  filter(wave_fac %in% c("May", "November")) %>%
  mutate(wave_fac = glue("{wave_fac} 2020")) %>%
  ggplot(aes(x = apa.discrim.rona, fill = prior)) +
    geom_density(alpha = 0.3) +
    facet_wrap(~wave_fac) +
    labs(x = "COVID-19 Discrimination Perception",
         y = "Density", 
         fill = "Pre-Covid General Discrimination Perception") +
    theme(legend.position = "bottom") +
    geom_vline(xintercept = 0.5, linetype = "dotted", col = "red", size = 1)
```

## Correlation coefficients 

```{r}
cor_plot <- df %>% 
  filter(wave_fac %in% c("May", "November")) %>%
  group_by(prior, wave_fac) %>%
  dplyr::summarize("COVID-19 discrimination" = cor(biden, apa.discrim.rona, use = "complete.obs"),
            "Democratic partisanship" = cor(biden, DEM, use = "complete.obs")) %>%
  pivot_longer(cols = matches("COVID|Democratic"),
               names_to = "variables",
               values_to = "corr_coefs") %>%
  ggplot(aes(x = glue("{factor(wave_fac)} 2020"), y = corr_coefs, fill = variables)) +
  geom_col(position = "dodge2") +
  facet_wrap(~prior) +
  labs(y = "Correlation coefficient (r)", x = "",
       fill = "Correlation variables") +
  theme(legend.position = "bottom")
```
### Figure 3

```{r}
cor_plot

ggsave(here("outputs", "figure3.png"))
```

```{r}
subset(df, wave == 1)$prior %>% table()

df %>% 
  filter(wave != 1) %>%
  group_by(wave) %>%
  summarize(miss_biden = mean(is.na(biden)))

# second wave - 33%
# third wave - 27%
```

# Data summaries 

## Table 3

```{r}
df %>%
  filter(wave == 1) %>%
  group_by(prior) %>%
  count()
```

```{r}
datasummary(
  (`Income` = income) + 
  (`Education` = edu) + 
  (`Male` = male) + 
  (`Chinese` = chinese) + 
  (`Filipino` = filipino) + 
  (`Indian` = indian) + 
  (`Japanese` = japanese) + 
  (`Korean` = korean) + 
  (`Vietnamese` = vietnamese) + 
  (`Democrat` = DEM) + 
  (`Republican` = GOP) ~ prior * (Mean + SD),
  data = df %>%
    filter(wave == 1) %>%
    select(income, edu, usborn, male, DEM, GOP, 
           chinese, indian, 
           filipino, vietnamese,
           korean, japanese,  
           prior),
  fmt = 1, 
  sparse_header = TRUE,
  note = "Only the first wave respondents were included",
  output = here("outputs", "table3.docx"))
```

## Table C1

```{r}
table_df <- df %>%
  dplyr::summarize(female = 1 - mean(male),
            for_born = 1 - mean(usborn),
            edu_level = mean(edu),
            income_level = mean(income),
            chinese_pct = mean(if_else(str_detect(as.character(natorigin), "1"), 1, 0), na.rm = T),
            japanese_pct = mean(if_else(str_detect(as.character(natorigin), "6"), 1, 0), na.rm = T),
            filipino_pct = mean(if_else(str_detect(as.character(natorigin), "4"), 1, 0), na.rm = T),
            korean_pct = mean(if_else(str_detect(as.character(natorigin), "3"), 1, 0), na.rm = T),
            vietnamese_pct = mean(if_else(str_detect(as.character(natorigin), "5"), 1, 0), na.rm = T),
            indian_pct = mean(if_else(str_detect(as.character(natorigin), "2"), 1, 0), na.rm = T)) %>% 
  pivot_longer(cols = everything(), 
               names_to = "Variables",
               values_to = "Percentage") 

table_df[-c(3:4),]$Percentage <- round(table_df[-c(3:4),]$Percentage, 2)*100

table_df$Percentage[table_df$Variables == "edu_level"] <- "4 year degree"
table_df$Percentage[table_df$Variables == "income_level"] <- "$70K - $79,999"

table_df[-c(3:4),]$Percentage <- paste0(table_df[-c(3:4),]$Percentage, "%")

names(table_df)[2] <- "Percentage/Average"

table_df$Variables <- c("% Female", "% Foreign born", "Education level", "Income level", "% Chinese", "% Japanese", "% Fipipino", "% Korean", "% Vietnamese", "% Indian")

sum_table <- regulartable(table_df) %>%
  theme_booktabs() %>%
  autofit()

sum_table
```

```{r}
save_as_docx(sum_table, path = here("outputs", "table_c1.docx"))
```